from sklearn.datasets import make_blobs
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from sklearn.cluster import KMeans
from sklearn import datasets
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures
import random
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
# Load the store-level dataset: per-age-band sales ratios, average months in
# business, similar-store counts, and total sales per record.
data = pd.read_csv("plus_living.csv")
# Rescale living-population counts to thousands for readability.
data["total_living_people"] = data["total_living_people"] / 1000
# Row index used later as a merge key. Derived from the actual row count
# instead of the hard-coded 31547, so the script survives data updates.
data["idx"] = np.arange(len(data))
# Drop the stray CSV index column and the Korean category-name column
# (non-numeric Korean text is what broke StandardScaler downstream).
del data['Unnamed: 0']
del data['cm_code_name']
# Working copy of just the numeric feature columns we intend to scale.
st_data = data[["10's_sales_rate", "20's_sales_rate", "30's_sales_rate",
                "40's_sales_rate", "50's_sales_rate", "60's_sales_rate",
                "simillar_store_number"]].copy()
# Z-score a single column of the module-global st_data frame in place.
def standardScaler(header_list):
    """Replace st_data[header_list] with its standard-scaled (zero mean,
    unit variance) values.

    header_list: name of one numeric column present in st_data.

    NOTE(review): the original padded the matrix with a dummy 'idx' column
    (hard-coded to 31547 rows) and dropped it again after scaling; since
    StandardScaler scales each column independently, scaling the target
    column alone is equivalent and row-count independent.
    """
    # 2-D (n_samples, 1) array, the shape StandardScaler.fit expects.
    values = st_data[[header_list]].values
    # Local name must not shadow this function (or the sklearn class).
    scaler = StandardScaler()
    # Keep the original diagnostic print of the fitted scaler's repr.
    print(scaler.fit(values))
    st_data[header_list] = scaler.transform(values)
# Scale only the selected feature columns: the six age-band sales ratios
# plus the similar-store count. Everything else is left untouched.
scale_columns = {
    "10's_sales_rate", "20's_sales_rate", "30's_sales_rate",
    "40's_sales_rate", "50's_sales_rate", "60's_sales_rate",
    "simillar_store_number",
}
# Iterate the frame's columns in order; set membership replaces the
# original seven-way `or` chain.
for col in list(data):
    if col in scale_columns:
        standardScaler(col)
# --- Clustering ---
# K-means on the scaled age-ratio columns plus the similar-store count.
cluster_features = ["10's_sales_rate", "20's_sales_rate", "30's_sales_rate",
                    "40's_sales_rate", "50's_sales_rate", "60's_sales_rate",
                    "simillar_store_number"]
test_kmeans = st_data[cluster_features].copy()
data_points = test_kmeans.values
# Fixed random_state so cluster ids are reproducible across runs (the
# original relied on KMeans' nondeterministic default seeding, so every
# run could relabel/regroup the clusters).
kmeans = KMeans(n_clusters=5, random_state=0).fit(data_points)
test_kmeans['cluster_id'] = kmeans.labels_
df = test_kmeans.copy()
# Merge the cluster labels back onto the unscaled data via the row index
# (length-derived, not the hard-coded 31547).
test_kmeans["idx"] = np.arange(len(test_kmeans))
test_kmeans = test_kmeans[["cluster_id", "idx"]]
new = pd.merge(data, test_kmeans, on=["idx"])
del new["idx"]
# One sub-frame per cluster id (the unused rateof_* aliases are dropped —
# nothing else in the file referenced them).
test0 = new[new['cluster_id'] == 0]
test1 = new[new['cluster_id'] == 1]
test2 = new[new['cluster_id'] == 2]
test3 = new[new['cluster_id'] == 3]
test4 = new[new['cluster_id'] == 4]
# Per-cluster column means; computed once (the original rebuilt the
# identical frame a second time before the second plot).
Mean = pd.DataFrame({'test0': test0.mean(), 'test1': test1.mean(),
                     'test2': test2.mean(), 'test3': test3.mean(),
                     'test4': test4.mean()})
Meandf = Mean.T  # clusters as rows, features as columns
# Business-health indicators per cluster (transposed so clusters become
# the plotted series).
A = pd.DataFrame(Meandf, columns=["act_jipyo_value", 'growth_jipyo_value',
                                  'safety_jipyo_value'])
A = A.T
A.plot(figsize=(12, 4), legend=True, fontsize=15)
# Age-band sales-ratio profile per cluster.
A2 = pd.DataFrame(Meandf, columns=["10's_sales_rate", "20's_sales_rate",
                                   "30's_sales_rate", "40's_sales_rate",
                                   "50's_sales_rate", "60's_sales_rate"])
A2 = A2.T
A2.plot(figsize=(12, 4), legend=True, fontsize=15)
from matplotlib import font_manager, rc

# Register a Korean-capable font so Hangul labels render correctly.
# NOTE(review): hard-coded Windows font path — fails on other OSes; confirm
# the target environment or make the path configurable.
font_name = font_manager.FontProperties(fname="c:/Windows/Fonts/malgun.ttf").get_name()
rc('font', family=font_name)

# Attach human-readable service-code names to each per-cluster frame
# (one merge expression instead of five copy-pasted statements).
code_name = pd.read_csv("code_name.csv")
test0, test1, test2, test3, test4 = (
    pd.merge(t, code_name, on="service_code")
    for t in (test0, test1, test2, test3, test4)
)
# One pie chart per cluster showing its top-5 most common service codes.
# A single loop replaces five copy-pasted subplot stanzas.
plt.rcParams.update({'font.size': 40})
fig = plt.figure(figsize=(50, 50))
for i, cluster_df in enumerate([test0, test1, test2, test3, test4]):
    ax = fig.add_subplot(3, 2, i + 1)  # 3x2 grid, positions 1..5
    top5 = cluster_df.service_code_name.value_counts().head(5)
    top5.plot.pie(ax=ax, autopct='%.2f%%')
    ax.set_title("service_code")
    ax.axis('equal')  # circular pies
plt.show()
import seaborn as sns

# Strip plot of each age band's (scaled) sales ratio, grouped and colored
# by cluster id. One loop replaces six copy-pasted stanzas.
plt.rcParams.update({'font.size': 18})
fig = plt.figure(figsize=(50, 50))
for col in ["10's_sales_rate", "20's_sales_rate", "30's_sales_rate",
            "40's_sales_rate", "50's_sales_rate", "60's_sales_rate"]:
    # x/y must be keyword arguments: positional use was deprecated and
    # then removed in seaborn >= 0.12.
    sns.lmplot(x='cluster_id', y=col, data=new, fit_reg=False,
               scatter_kws={"s": 1}, hue="cluster_id")
    plt.title(col)
from sklearn.decomposition import PCA

# Render minus signs correctly under the Hangul font.
plt.rcParams['axes.unicode_minus'] = False
plt.rcParams.update({'font.size': 10})

# Project the scaled features onto the first three principal components.
pca = PCA(n_components=3)
X_scaled = st_data
pca.fit(X_scaled)
X_pca = pd.DataFrame(pca.transform(X_scaled))
print("원본 데이터 형태: {}".format(str(X_scaled.shape)))
print("축소된 데이터 형태: {}".format(str(X_pca.shape)))

# Color each projected point by its cluster id from the merged frame.
clusterid = pd.DataFrame(new, columns=["cluster_id"])
X_pca['cluster_id'] = clusterid

fig = plt.figure(figsize=(15, 15))
ax = fig.add_subplot(projection='3d')
ax.scatter(X_pca[0], X_pca[1], X_pca[2], c=X_pca['cluster_id'], marker='o')
ax.set_xlabel('x axis')
ax.set_ylabel('y axis')
ax.set_zlabel('z axis')
plt.show()
import pandas_profiling as pp
# NOTE(review): bare ProfileReport(...) expressions only render inside a
# notebook; run as a plain script these reports are built and immediately
# discarded. Confirm this file is a notebook export, or capture the reports
# (e.g. with .to_file) if it is meant to run as a script.
pp.ProfileReport(test0)
pp.ProfileReport(test1)
pp.ProfileReport(test2)